{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "import numpy as np\n",
    "import pandas as pd \n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from xgboost import XGBClassifier\n",
    "import xgboost as xgb\n",
    "import lightgbm as lgb\n",
    "from lightgbm.sklearn import LGBMClassifier\n",
    "from xgboost import XGBClassifier\n",
    "import xgboost as xgb\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.svm import SVC\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#set data path\n",
    "train_x_csv = \"./data/train_x.csv\"\n",
    "train_y_csv = \"./data/train_y.csv\"\n",
    "test_x_csv = \"./data/test_x.csv\"\n",
    "features_type_csv = \"./data/features_type.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#load data\n",
    "train_x = pd.read_csv(train_x_csv)\n",
    "train_y = pd.read_csv(train_y_csv)\n",
    "train_xy = pd.merge(train_x,train_y,on='uid')\n",
    "\n",
    "test = pd.read_csv(test_x_csv)\n",
    "test_uid = test.uid\n",
    "test_x = test.drop(['uid'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = train_x.drop(['uid'],axis=1)\n",
    "y = train_y.drop(['uid'],axis=1)\n",
    "X_submission = test_x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(0)  # seed to shuffle the train set\n",
    "\n",
    "n_folds = 10\n",
    "verbose = True\n",
    "shuffle = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "if shuffle:\n",
    "        idx = np.random.permutation(y.size)\n",
    "        X = X[idx]\n",
    "        y = y[idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "sfolder = StratifiedKFold(n_splits=5,random_state=0,shuffle=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "bst = RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "bst2 = RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "bst3 = LGBMClassifier(\n",
    "        num_leaves = 7,\n",
    "        objective =  'binary',\n",
    "        max_depth = 7,\n",
    "        learning_rate = 0.1,\n",
    "        max_bin = 200,\n",
    "        n_estimators = 85)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "bst4 = XGBClassifier(\n",
    "        learning_rate =0.1,\n",
    "        n_estimators=2000,  \n",
    "        max_depth=8,\n",
    "        min_child_weight=4,\n",
    "        gamma=0,\n",
    "        subsample=0.8,\n",
    "        colsample_bytree=0.9,\n",
    "        colsample_bylevel = 0.7,\n",
    "        objective= 'binary:logistic',\n",
    "        reg_alpha = 2,\n",
    "        reg_lambda = 0.1,\n",
    "        n_jobs = -1,\n",
    "        seed=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "bst5 = SVC()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "clfs = [bst, bst2,bst3,bst4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_blend_train = np.zeros((X.shape[0], len(clfs)))\n",
    "dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "skf = list(sfolder.split(X, y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
      "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
      "            min_impurity_decrease=0.0, min_impurity_split=None,\n",
      "            min_samples_leaf=1, min_samples_split=2,\n",
      "            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,\n",
      "            oob_score=False, random_state=None, verbose=0,\n",
      "            warm_start=False)\n",
      "Fold0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ml/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:13: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  del sys.path[0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold1\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ml/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:13: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  del sys.path[0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ml/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:13: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  del sys.path[0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold3\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ml/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:13: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  del sys.path[0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold4\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ml/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:13: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  del sys.path[0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',\n",
      "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
      "            min_impurity_decrease=0.0, min_impurity_split=None,\n",
      "            min_samples_leaf=1, min_samples_split=2,\n",
      "            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,\n",
      "            oob_score=False, random_state=None, verbose=0,\n",
      "            warm_start=False)\n",
      "Fold0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ml/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:13: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  del sys.path[0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold1\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ml/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:13: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  del sys.path[0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ml/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:13: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  del sys.path[0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold3\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ml/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:13: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  del sys.path[0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold4\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ml/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:13: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  del sys.path[0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2 LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,\n",
      "        importance_type='split', learning_rate=0.1, max_bin=200,\n",
      "        max_depth=7, min_child_samples=20, min_child_weight=0.001,\n",
      "        min_split_gain=0.0, n_estimators=85, n_jobs=-1, num_leaves=7,\n",
      "        objective='binary', random_state=None, reg_alpha=0.0,\n",
      "        reg_lambda=0.0, silent=True, subsample=1.0,\n",
      "        subsample_for_bin=200000, subsample_freq=0)\n",
      "Fold0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ml/miniconda3/lib/python3.6/site-packages/sklearn/preprocessing/label.py:219: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n",
      "/home/ml/miniconda3/lib/python3.6/site-packages/sklearn/preprocessing/label.py:252: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fold1\n",
      "Fold2\n",
      "Fold3\n",
      "Fold4\n",
      "3 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,\n",
      "       colsample_bytree=0.9, gamma=0, learning_rate=0.1, max_delta_step=0,\n",
      "       max_depth=8, min_child_weight=4, missing=None, n_estimators=2000,\n",
      "       n_jobs=-1, nthread=None, objective='binary:logistic',\n",
      "       random_state=0, reg_alpha=2, reg_lambda=0.1, scale_pos_weight=1,\n",
      "       seed=3, silent=True, subsample=0.8)\n",
      "Fold0\n",
      "Fold1\n",
      "Fold2\n",
      "Fold3\n",
      "Fold4\n"
     ]
    }
   ],
   "source": [
    "for j, clf in enumerate(clfs):\n",
    "    print(j, clf)\n",
    "    dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))\n",
    "    for i, (train, test) in enumerate(skf):\n",
    "        \n",
    "        print(\"Fold\" + str(i))\n",
    "        X_train = X.loc[train]\n",
    "        y_train = y.loc[train]\n",
    "        X_test = X.loc[test]\n",
    "        y_test = y.loc[test]\n",
    "        \n",
    "        \n",
    "        clf.fit(X_train, y_train)\n",
    "        y_submission = clf.predict_proba(X_test)[:, 1]\n",
    "        dataset_blend_train[test, j] = y_submission\n",
    "        \n",
    "        dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]\n",
    "dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ml/miniconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "/home/ml/miniconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:761: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n"
     ]
    }
   ],
   "source": [
    "clf = LogisticRegression(C=0.01)\n",
    "#clf = LogisticRegression(C=0.01)\n",
    "clf.fit(dataset_blend_train, y)\n",
    "y_submission = clf.predict_proba(dataset_blend_test)[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.65113957, 0.64636073, 0.64572003, ..., 0.65034395, 0.64924036,\n",
       "       0.65020666])"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_result = pd.DataFrame(columns=[\"uid\",\"score\"])\n",
    "test_result.uid = test_uid\n",
    "test_result.score = y_submission\n",
    "test_result.to_csv(\"test_predict_result.csv\",index=None,encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
